Load R packages and define colour functions

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(colorspace)
library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(knitr) ; library(kableExtra)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

# Get colors from the ggplot palette
gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n+1)
  pal = hcl(h = hues, l = 65, c = 100)[1:n]
}

# Assign an HCL rainbow colour to each module
get_mod_colours = function(mods){
  
  n = length(unique(mods))-1
  set.seed(123) ; rand_order = sample(1:n)
  mod_colors = c('white', gg_colour_hue(n)[rand_order])
  names(mod_colors) = mods %>% table %>% names
  
  return(mod_colors)
}

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame

# WGCNA metrics
WGCNA_metrics = read.csv('./../Data/preprocessedData/WGCNA_metrics.csv')

# Updates genes_info with SFARI information and clusters
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>% 
             left_join(datGenes %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, hgnc_symbol), by = 'ID') %>%
             dplyr::select(ID, hgnc_symbol, log2FoldChange, shrunken_log2FoldChange, significant, Neuronal) %>%
             left_join(WGCNA_metrics, by = 'ID') %>% dplyr::select(-contains('pval'))


rm(dds, WGCNA_metrics)




3.3.1 Top clusters by Cluster-diagnosis correlation


Selecting the top clusters

Modules with a cluster-diagnosis correlation manitude larger than 0.9

plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor) %>% distinct %>% 
            mutate(alpha = ifelse(abs(MTcor)>0.9, 1, 0.3))

top_modules = plot_data %>% arrange(desc(MTcor)) %>% filter(abs(MTcor) > 0.9) %>% pull(Module) %>% as.character

The 3 modules that fulfill this condition are clusters 20, 36, 45

ggplotly(plot_data %>% mutate(module_number=ifelse(module_number == max(module_number), 'No cluster', 
                                                   paste('Cluster', module_number))) %>%
         ggplot(aes(reorder(module_number, -MTcor), MTcor)) + 
         geom_bar(stat='identity', fill = plot_data$Module, alpha = plot_data$alpha) + 
         geom_hline(yintercept =c(0.9, -0.9), color = 'gray', linetype = 'dotted') + 
         xlab('Clusters')+ ylab('Cluster-diagnosis correlation') + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90, hjust = 1)))

The modules consist mainly of points with high (absolute) values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules

The genes belonging to the modules with the positive Module-Diagnosis correlation have positive LFC values and the genes belonging to the modules with the negative Module-Diagnosis correlation have negative values.

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
                   paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
            mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
            apply_labels(ImportantModules_number = 'Top Clusters')

cro(plot_data$ImportantModules_number)
 #Total 
 Top Clusters 
   Cluster 20  562
   Cluster 36  623
   Cluster 45  97
   Others  14835
   #Total cases  16117
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
         ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)


Module Eigengenes


Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control

In all cases, the Eigengenes separate the behaviour between autism and control samples very clearly (p-value < \(10^{-4}\)). Modules with positive Module-Diagnosis correlation have higher eigengenes in the ASD samples and Modules with a negative correlation, in the Control samples

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)], 
                         'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + 
      geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
      ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], 
                     '  (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) + 
      stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
                        ref.group = 'ASD') +
      ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
  
  return(p)
}


# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, ME_object, MEs, p1, p2, p3)


Identifying representative genes for each Module


In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same

SFARI Genes don’t seem to be more representative than the rest of the genes

create_plot = function(module){
  
  plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>% 
              filter(genes_info$Module==module)
  colnames(plot_data)[2] = 'MM'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
  
  p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
                                                        as.character(gene.score))) %>% 
               ggplot(aes(MM, GS, color=gene.score)) +
               geom_point(alpha=0.5, aes(ID=ID)) +  xlab('Cluster Membership') + ylab('Gene Significance') +
               ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], '  (MTcor = ', 
                              round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
  
  # For thesis
  # p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
  #                                              ifelse(gene.score=='Neuronal', as.character(gene.score), 
  #                                                     paste('Score',as.character(gene.score))))) %>% 
  #     mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal', 
  #                                                       'Not in SFARI')),
  #            alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
  #     ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +  
  #     xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
  #     scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() + 
  #     labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
  # if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)

Top 10 genes for each module


Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules

create_table = function(module){
  top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% 
              filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(Relevance = (MM+abs(GS))/2, 
                     gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>% 
              arrange(by=-Relevance) %>% top_n(10) %>% 
              dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 36 (MTcor = 0.97)
Gene Symbol MM GS SFARI Score Relevance
MCL1 0.8325746 0.9067802 Not in SFARI 0.8696774
ITGA5 0.8372503 0.8588163 Not in SFARI 0.8480333
PXN 0.8158056 0.8682115 Not in SFARI 0.8420086
SRGAP1 0.7298273 0.9211833 Not in SFARI 0.8255053
CFLAR 0.8063163 0.8155733 Not in SFARI 0.8109448
RREB1 0.7626458 0.8430492 Not in SFARI 0.8028475
MYOF 0.7685244 0.8326888 Not in SFARI 0.8006066
SERPINE1 0.8039283 0.7893995 3 0.7966639
OLFML2B 0.7317113 0.8526315 Not in SFARI 0.7921714
LATS2 0.7354979 0.8487660 Not in SFARI 0.7921319
kable(top_genes[[2]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 45 (MTcor = 0.92)
Gene Symbol MM GS SFARI Score Relevance
EPS8 0.7997752 0.8310258 Not in SFARI 0.8154005
YAP1 0.7535539 0.8480815 Not in SFARI 0.8008177
TOB2 0.7257486 0.8677757 Not in SFARI 0.7967622
FOXO1 0.7358202 0.8502902 Neuronal 0.7930552
PARD3B 0.7719367 0.8139975 2 0.7929671
CLIC1 0.7415732 0.7915367 Not in SFARI 0.7665550
ACKR3 0.7658517 0.7522397 Not in SFARI 0.7590457
FAM129B 0.6339106 0.8683425 Not in SFARI 0.7511266
IL1R1 0.7554235 0.7442369 Not in SFARI 0.7498302
SLC16A9 0.7597334 0.7022276 Not in SFARI 0.7309805
kable(top_genes[[3]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 20 (MTcor = -0.92)
Gene Symbol MM GS SFARI Score Relevance
RNF157 0.8841689 -0.8478223 Not in SFARI 0.8659956
LIMK1 0.8662292 -0.8615976 Neuronal 0.8639134
MAP3K9 0.8066583 -0.8862639 Not in SFARI 0.8464611
CPLX1 0.8479632 -0.8259295 Neuronal 0.8369463
PNKD 0.8356434 -0.8331841 Not in SFARI 0.8344138
VAMP1 0.8505980 -0.8010863 Neuronal 0.8258422
SHD 0.7828133 -0.8622930 Not in SFARI 0.8225532
KATNB1 0.8528400 -0.7891769 Neuronal 0.8210084
UBE2M 0.7887791 -0.8492042 Neuronal 0.8189916
LYNX1 0.8177942 -0.8009674 Not in SFARI 0.8093808
rm(create_table, i)

Most of the top genes, idenpendently of the cluster, have very high absolute values for the 2nd principal component

pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol, 
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
            mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
                                  ifelse(color %in% top_modules, 0.25, 0.1)))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
                        color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
              theme_minimal() + ggtitle('Most relevant genes for top clusters')

rm(pca, tg, plot_data)


30/30 genes are differentially expressed

2/30 genes are differentially expressed (SERPINE1, PARD3B)

Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Dissected_Sample_ID, Diagnosis),
                        by = c('variable'='Dissected_Sample_ID')) %>% arrange(desc(Relevance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
                                    levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      ggtitle(paste0('Top genes for cluster ', 
                                     genes_info$module_number[genes_info$Module==top_modules[i]][1], 
                                     ' (MTcor = ',
                      round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) + 
                      xlab('') + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  # For thesis
  # p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
  #                                   levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
  #     ggplot(aes(external_gene_id, value, fill=Diagnosis)) + 
  #     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
  #     xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
  # if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
  
  return(p)
  
}

create_plot(1)
create_plot(2)
create_plot(3)
rm(create_plot)





3.3.2 Top clusters by enrichment in SFARI Genes


Selecting the top clusters

Using ORA, as it was done in the previous section and selecting as top clusters the ones with an adjusted p-value lower than \(10^{-3}\) (enrichment higher than 0.999)

# Calculate % and ORA of SFARI Genes in each module

#modules = unique(genes_info$Module[genes_info$Module!='gray']) %>% as.character
modules = unique(genes_info$Module) %>% as.character

# We need the entrez ID of the genes for this
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', 
               host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                       values=genes_info$ID, mart=mart) %>%
                 left_join(genes_info %>% dplyr::select(ID, Module,gene.score), by = c('ensembl_gene_id'='ID'))

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>% mutate(term = ifelse(gene.score == 'Others', 'Others', 'SFARI'), 
                                      'gene' = entrezgene) %>%  dplyr::select(term, gene) %>% distinct


enrichment_data = data.frame('Module' = modules, 'size' = 0, 'perc_SFARI' = 0,
                             'pval_ORA' = 0, 'padj_ORA' = 0)

for(i in 1:length(modules)){
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 50000) %>% 
                        data.frame %>% dplyr::select(-geneID,-Description)
  ORA_pval = ifelse('SFARI' %in% ORA_module$ID, ORA_module$pvalue[ORA_module$ID=='SFARI'], 1)
  ORA_padj = ifelse('SFARI' %in% ORA_module$ID, ORA_module$p.adjust[ORA_module$ID=='SFARI'], 1)
  enrichment_data[i,-1] = c(length(genes_in_module), 
                            mean(genes_info$gene.score[genes_info$Module==module]!='Others'), 
                            ORA_pval, ORA_padj)
}

enrichment_data = enrichment_data %>% 
                  left_join(genes_info %>% dplyr::select(Module) %>% unique, by = 'Module')

genes_info = genes_info %>% left_join(enrichment_data, by = 'Module') 

plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor, size, padj_ORA) %>% distinct %>% 
            mutate(alpha = ifelse(abs(1-padj_ORA)>0.999, 1, 0.3))

top_modules = plot_data %>% arrange(padj_ORA) %>% filter((1-padj_ORA) > 0.999) %>% pull(Module) %>% as.character

rm(i, module, genes_in_module, ORA_module, ORA_pval, ORA_padj, getinfo, mart, term2gene)

The 3 modules that fulfill this condition are clusters 7, 22, 39

ggplotly(plot_data %>% ggplot(aes(MTcor, padj_ORA, size=size)) + 
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=module_number)) +
         geom_hline(yintercept = 0.001, color = 'gray', linetype = 'dotted') + 
         xlab('Cluster-diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
         theme_minimal() + theme(legend.position = 'none') +
         ggtitle('Clusters Significantly Enriched in SFARI Genes'))
pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
                   paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
            mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
            apply_labels(ImportantModules_number = 'Top Clusters')

cro(plot_data$ImportantModules_number)
 #Total 
 Top Clusters 
   Cluster 22  67
   Cluster 39  360
   Cluster 7  1504
   Others  14186
   #Total cases  16117
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
         xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
         ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
         ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)


Module Eigengenes


In all cases, the Eigengenes separate the behaviour between autism and control samples very clearly (p-value < \(10^{-4}\)). Modules with positive Module-Diagnosis correlation have higher eigengenes in the ASD samples and Modules with a negative correlation, in the Control samples

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)], 
                         'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + 
      geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
      ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], 
                     '  (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) + 
      stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
                        ref.group = 'ASD') +
      ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
  
  return(p)
}


# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])

grid.arrange(p1, p2, p3, nrow=1)

rm(plot_EGs, ME_object, MEs, p1, p2, p3)



Identifying representative genes for each Module


In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same

SFARI Genes don’t seem to be more representative than the rest of the genes

create_plot = function(module){
  
  plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>% 
              filter(genes_info$Module==module)
  colnames(plot_data)[2] = 'MM'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
  
  p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI', 
                                                        as.character(gene.score))) %>% 
               ggplot(aes(MM, GS, color=gene.score)) +
               geom_point(alpha=0.5, aes(ID=ID)) +  xlab('Cluster Membership') + ylab('Gene Significance') +
               ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], '  (MTcor = ', 
                              round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
  
  # For thesis
  # p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
  #                                              ifelse(gene.score=='Neuronal', as.character(gene.score),
  #                                                     paste('Score',as.character(gene.score))))) %>%
  #     mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal',
  #                                                       'Not in SFARI')),
  #            alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
  #     ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +
  #     xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
  #     scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() +
  #     labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
  # if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)

Top 10 genes for each module


Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t that many SFARI genes in the top genes of the modules

create_table = function(module){
  top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% 
              filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.', gsub('#','',module))) %>% 
              mutate(Relevance = (MM+abs(GS))/2, 
                     gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>% 
              arrange(by=-Relevance) %>% top_n(10) %>% 
              dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 7 (MTcor = -0.9)
Gene Symbol MM GS SFARI Score Relevance
MAPK9 0.8968422 -0.9392328 Neuronal 0.9180375
TRIM37 0.8936259 -0.9293330 Not in SFARI 0.9114795
PREPL 0.8907183 -0.9035408 Not in SFARI 0.8971295
NAP1L5 0.8751900 -0.9036163 Not in SFARI 0.8894031
TTBK2 0.9009532 -0.8742880 Not in SFARI 0.8876206
EIF5A2 0.8516030 -0.9108883 Not in SFARI 0.8812457
PRKCE 0.8446185 -0.9076184 Neuronal 0.8761185
DIRAS1 0.8525683 -0.8958339 Not in SFARI 0.8742011
ATP6V1C1 0.8400817 -0.8957404 Not in SFARI 0.8679110
NAPB 0.9029772 -0.8290573 Not in SFARI 0.8660173
kable(top_genes[[2]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 39 (MTcor = -0.55)
Gene Symbol MM GS SFARI Score Relevance
FN3K 0.8030437 -0.8000068 Not in SFARI 0.8015253
KIF1A 0.8342445 -0.7543228 Neuronal 0.7942837
CELSR2 0.8551589 -0.7116076 Neuronal 0.7833833
STX1B 0.7924147 -0.7626449 Neuronal 0.7775298
EPB41L1 0.8101531 -0.7304887 Not in SFARI 0.7703209
PRKACA 0.8795831 -0.6594748 Neuronal 0.7695290
SYN1 0.8678907 -0.6406826 3 0.7542866
CAMTA2 0.8069597 -0.6980794 Not in SFARI 0.7525196
WNK2 0.7500853 -0.7517314 Not in SFARI 0.7509084
MARCH6 0.7877729 -0.6990101 Not in SFARI 0.7433915
kable(top_genes[[3]] %>% dplyr::select(-ID), 
      caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1], 
      '  (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>% 
      kable_styling(full_width = F)
Top 10 genes for Module 22 (MTcor = 0.55)
Gene Symbol MM GS SFARI Score Relevance
SPRY2 0.7566443 0.5560984 Neuronal 0.6563714
KIAA1549 0.7517636 0.5504036 Not in SFARI 0.6510836
TET3 0.8120406 0.4852710 Not in SFARI 0.6486558
DNAJB5 0.7399456 0.5226353 Not in SFARI 0.6312905
DUSP6 0.8448647 0.4085318 Neuronal 0.6266982
GAREM 0.6976841 0.5472754 Not in SFARI 0.6224798
TRIM9 0.7593863 0.4835355 Not in SFARI 0.6214609
EGR2 0.7291357 0.4856175 Neuronal 0.6073766
TRIB1 0.6993757 0.5113890 Not in SFARI 0.6053823
KDM5B 0.6164204 0.5860347 2 0.6012275
rm(create_table, i)

Top genes don’t have PC2 values as high as they did with the top genes by cluster-diagnosis correlation

pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol, 
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
            mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
                                  ifelse(color %in% top_modules, 0.25, 0.1)))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
                        color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
              theme_minimal() + ggtitle('Most relevant genes for top clusters')

rm(pca, tg, plot_data)


30/30 genes are differentially expressed

2/30 genes are differentially expressed (SYN1, KDM5B)

Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes, but it’s not as strong as the differences in the top cluster by cluster-diagnosis correlation

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(Dissected_Sample_ID, Diagnosis),
                        by = c('variable'='Dissected_Sample_ID')) %>% arrange(desc(Relevance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`, 
                                    levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      ggtitle(paste0('Top genes for cluster ', 
                                     genes_info$module_number[genes_info$Module==top_modules[i]][1], 
                                     ' (MTcor = ',
                      round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) + 
                      xlab('') + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  
  # # For the thesis
  # p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
  #                                   levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
  #     ggplot(aes(external_gene_id, value, fill=Diagnosis)) +
  #     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
  #     xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
  # if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
  
  return(p)
  
}

create_plot(1)
create_plot(2)
create_plot(3)
rm(create_plot)




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0       knitr_1.32             org.Hs.eg.db_3.8.2    
##  [4] AnnotationDbi_1.46.1   IRanges_2.18.3         S4Vectors_0.22.1      
##  [7] Biobase_2.44.0         BiocGenerics_0.30.0    DOSE_3.10.2           
## [10] ReactomePA_1.28.0      clusterProfiler_3.12.0 biomaRt_2.40.5        
## [13] polycor_0.7-10         expss_0.10.2           WGCNA_1.69            
## [16] fastcluster_1.1.25     dynamicTreeCut_1.63-1  ggExtra_0.9           
## [19] ggpubr_0.2.5           magrittr_2.0.1         GGally_1.5.0          
## [22] colorspace_2.0-2       gridExtra_2.3          viridis_0.5.1         
## [25] viridisLite_0.4.0      RColorBrewer_1.1-2     dendextend_1.13.4     
## [28] plotly_4.9.2           glue_1.4.2             reshape2_1.4.4        
## [31] forcats_0.5.0          stringr_1.4.0          dplyr_1.0.1           
## [34] purrr_0.3.4            readr_1.3.1            tidyr_1.1.0           
## [37] tibble_3.1.2           ggplot2_3.3.5          tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.1                  tidyselect_1.1.0           
##   [3] RSQLite_2.2.0               htmlwidgets_1.5.1          
##   [5] grid_3.6.3                  BiocParallel_1.18.1        
##   [7] munsell_0.5.0               codetools_0.2-16           
##   [9] preprocessCore_1.46.0       miniUI_0.1.1.1             
##  [11] withr_2.4.2                 GOSemSim_2.10.0            
##  [13] highr_0.8                   rstudioapi_0.11            
##  [15] ggsignif_0.6.0              labeling_0.4.2             
##  [17] urltools_1.7.3              GenomeInfoDbData_1.2.1     
##  [19] polyclip_1.10-0             bit64_4.0.5                
##  [21] farver_2.1.0                vctrs_0.3.8                
##  [23] generics_0.0.2              xfun_0.22                  
##  [25] GenomeInfoDb_1.20.0         R6_2.5.0                   
##  [27] doParallel_1.0.15           graphlayouts_0.7.0         
##  [29] locfit_1.5-9.4              DelayedArray_0.10.0        
##  [31] bitops_1.0-6                reshape_0.8.8              
##  [33] fgsea_1.10.1                gridGraphics_0.5-0         
##  [35] assertthat_0.2.1            promises_1.2.0.1           
##  [37] scales_1.1.1                ggraph_2.0.3               
##  [39] nnet_7.3-14                 enrichplot_1.4.0           
##  [41] gtable_0.3.0                tidygraph_1.2.0            
##  [43] rlang_0.4.11                genefilter_1.66.0          
##  [45] splines_3.6.3               lazyeval_0.2.2             
##  [47] acepack_1.4.1               impute_1.58.0              
##  [49] broom_0.7.0                 europepmc_0.4              
##  [51] checkmate_2.0.0             BiocManager_1.30.10        
##  [53] yaml_2.2.1                  modelr_0.1.6               
##  [55] crosstalk_1.1.0.1           backports_1.1.8            
##  [57] httpuv_1.6.1                qvalue_2.16.0              
##  [59] Hmisc_4.4-0                 tools_3.6.3                
##  [61] ggplotify_0.0.5             ellipsis_0.3.2             
##  [63] jquerylib_0.1.4             ggridges_0.5.2             
##  [65] Rcpp_1.0.6                  plyr_1.8.6                 
##  [67] zlibbioc_1.30.0             base64enc_0.1-3            
##  [69] progress_1.2.2              RCurl_1.98-1.2             
##  [71] prettyunits_1.1.1           rpart_4.1-15               
##  [73] cowplot_1.1.1               SummarizedExperiment_1.14.1
##  [75] haven_2.2.0                 ggrepel_0.8.2              
##  [77] cluster_2.1.0               fs_1.5.0                   
##  [79] data.table_1.12.8           DO.db_2.9                  
##  [81] triebeard_0.3.0             reprex_0.3.0               
##  [83] reactome.db_1.68.0          matrixStats_0.56.0         
##  [85] hms_0.5.3                   mime_0.11                  
##  [87] evaluate_0.14               xtable_1.8-4               
##  [89] XML_3.99-0.3                jpeg_0.1-8.1               
##  [91] readxl_1.3.1                compiler_3.6.3             
##  [93] crayon_1.4.1                htmltools_0.5.1.1          
##  [95] later_1.2.0                 Formula_1.2-3              
##  [97] geneplotter_1.62.0          lubridate_1.7.4            
##  [99] DBI_1.1.0                   tweenr_1.0.1               
## [101] dbplyr_1.4.2                MASS_7.3-53                
## [103] rappdirs_0.3.3              Matrix_1.2-18              
## [105] cli_2.5.0                   igraph_1.2.5               
## [107] GenomicRanges_1.36.1        pkgconfig_2.0.3            
## [109] rvcheck_0.1.8               foreign_0.8-76             
## [111] xml2_1.2.5                  foreach_1.5.0              
## [113] annotate_1.62.0             bslib_0.2.5.1              
## [115] XVector_0.24.0              webshot_0.5.2              
## [117] rvest_0.3.5                 digest_0.6.27              
## [119] graph_1.62.0                rmarkdown_2.7              
## [121] cellranger_1.1.0            fastmatch_1.1-0            
## [123] htmlTable_1.13.3            curl_4.3                   
## [125] shiny_1.6.0                 graphite_1.30.0            
## [127] lifecycle_1.0.0             jsonlite_1.7.2             
## [129] fansi_0.5.0                 pillar_1.6.1               
## [131] lattice_0.20-41             fastmap_1.1.0              
## [133] httr_1.4.2                  survival_3.2-7             
## [135] GO.db_3.8.2                 UpSetR_1.4.0               
## [137] png_0.1-7                   iterators_1.0.12           
## [139] bit_4.0.4                   ggforce_0.3.1              
## [141] stringi_1.5.3               sass_0.4.0                 
## [143] blob_1.2.1                  DESeq2_1.24.0              
## [145] latticeExtra_0.6-29         memoise_1.1.0